##This is the code to conduct the merge of employee data to NYS voter file

library(feather) 
library(data.table)
library(lubridate)

#######################################
#####Step 1: I/O and initial formatting
##########################################
#First Load voter rolls
voters <- fread("AllNYSVoters2018.txt", header=FALSE)
colnames(voters) <- c("lastname", "firstname", "middlename", "namesuffix", "raddnumber", "rhalfcode",
                     "rapartment", "rstreetname", "rpredirection","rpostdirection", "rcity", "rzip5", "rzip4", "mailadd1",
                      "mailadd2", "mailadd3", "mailadd4","dob", "gender", "enrollment", "otherparty", 
                      "countycode","ed", "ld", "towncity", "ward", "cd", "sd",
                      "ad", "lastvoteddate", "prevyearvoted", "prevcounty", "prevaddress", "prevname",
                      "countynumber", "regdate", "vrsource", "idrequired", "idmet", "status",
                      "reasoncode", "inact_date", "purge_date", "sboeid", "voterhistory")

#subset to just nyc voters based on county
voters <- subset(voters, (countycode %in% c("3", "31", "24", "41", "43")) | (prevcounty %in% c("3", "31", "24", "41", "43")))

#recode county numbers to counties for merge
voters$county <- voters$countycode
voters$county[voters$county == "3"] <- "BRONX"
voters$county[voters$county == "31"] <- "MANHATTAN"
voters$county[voters$county == "24"] <- "BROOKLYN"
voters$county[voters$county == "41"] <- "QUEENS"
voters$county[voters$county == "43"] <- "RICHMOND"

#Add voter id
voters$idVoters <- 1:nrow(voters)

#clean up vote history, make voted in year x variables
voters$voterhistory <- toupper(gsub(",", "", voters$voterhistory)) #first standardize text

#generate the possible indicators of having voted
options17 <- c("20171107 GE", "GE 20171107", "2017 GENERAL ELECTION", "GENERAL 2017", "GENERAL ELECTION 2017") #note this is only on the 2018 voter file. 
options16 <- c("20161108 GE", "GE 20161108", "2016 GENERAL ELECTION", "GENERAL 2016", "GENERAL ELECTION 2016")
options14 <- c("20141104 GE", "GE 20141104", "2014 GENERAL ELECTION", "GENERAL 2014", "GENERAL ELECTION 2014")
options13 <- c("20131105 GE", "GE 20131105", "2013 GENERAL ELECTION", "GENERAL 2013", "GENERAL ELECTION 2013")
options12 <- c("20121106 GE", "GE 20121106", "2012 GENERAL ELECTION", "GENERAL 2012", "GENERAL ELECTION 2012")
options10 <- c("20101102 GE", "GE 20101102", "2010 GENERAL ELECTION", "GENERAL 2010", "GENERAL ELECTION 2010")
options08 <- c("20081104 GE", "GE 20081104", "2008 GENERAL ELECTION", "GENERAL 2008", "GENERAL ELECTION 2008")
options06 <- c("2006 GENERAL ELECTION", "GENERAL 2006", "GENERAL ELECTION 2006")
options04 <- c("2004 GENERAL ELECTION", "GENERAL 2004", "GENERAL ELECTION 2004")
options02 <- c("2002 GENERAL ELECTION", "GENERAL 2002", "GENERAL ELECTION 2002")



findvotehist <- function(votehistory, optionslist){#
	histsplit <- unlist(strsplit(votehistory, ";")) #split vote history by semicolon
	overlap <- intersect(histsplit, optionslist)
	votedyear <- ifelse(length(overlap)>0, 1, 0)	
	return(votedyear)}

voters$voted17 <- sapply(voters$voterhistory,findvotehist, options17)
voters$voted16 <- sapply(voters$voterhistory,findvotehist, options16)
voters$voted14 <- sapply(voters$voterhistory,findvotehist, options14)
voters$voted13 <- sapply(voters$voterhistory,findvotehist, options13)
voters$voted12 <- sapply(voters$voterhistory,findvotehist, options12)
voters$voted10 <- sapply(voters$voterhistory,findvotehist, options10)
voters$voted08 <- sapply(voters$voterhistory,findvotehist, options08)
voters$voted06 <- sapply(voters$voterhistory,findvotehist, options06)
voters$voted04 <- sapply(voters$voterhistory,findvotehist, options04)
voters$voted02 <- sapply(voters$voterhistory,findvotehist, options02)


voters$purgedate <- ymd(voters$purge_date)
voters[, datereg := ymd(regdate)]
regby2012 <- voters[datereg <= "2012-10-12" & (is.na(purgedate) | purgedate >= "2016-12-01") , ]
summary(regby2012$voted12);summary(regby2012$voted16) 

#Save it again
write_feather(voters, "All_Voters NYC Only.feather")
voterfilerows <- nrow(voters) 

##Import Employees
employees <- fread("NycEmployees.csv")

colnames(employees) <- gsub(" ", "", colnames(employees))

#some borough names are lowercase. We should fix that
employees$WorkLocationBorough <- toupper(employees$WorkLocationBorough)

#Also, change salaries to numeric format
employees$BaseSalary <- gsub("\\$", "", employees$BaseSalary)
employees$BaseSalary <- as.numeric(employees$BaseSalary)

#Id for employees -- only unique for fiscal year/ employee pairs
employees$idEmployees <- 1:nrow(employees)

#Make a full name variable
employees$EmployeeFull <- paste(employees$FirstName, employees$MidInit, employees$LastName)
employees$EmployeeFirstLast <- paste(employees$FirstName, employees$LastName)

#There's inconsistent capitalization in agency names and titles. Let's fix that
employees$AgencyName <- toupper(employees$AgencyName)
employees$TitleDescription <- toupper(employees$TitleDescription)

setkey(employees, "LastName", "FirstName", "MidInit", "AgencyName", "TitleDescription")
employees[WorkLocationBorough=="", WorkLocationBorough:=NA]

employees[, countyfilled:= unique(WorkLocationBorough[!is.na(WorkLocationBorough)]), by=list(LastName, FirstName, MidInit, AgencyName, TitleDescription)]

setnames(employees, "LeaveStatusasofJune30", "LeaveStatus") 

#now pull in the earlier (2011-2013) payroll data and line it up with the above data before reshaping.
load("NYCpayroll20112013.Rdata") #set up in "NYCrecords20112013.R"

head(oldpayrolldata) 
table(oldpayrolldata$filename)
oldpayrolldata[, FiscalYear := substr(gsub("./Data/NYC20112013/FY", "", filename, fixed=T), 0, 4)] #pull fiscal year out of filenames data was pulled from 
setnames(oldpayrolldata, "Agency", "AgencyName") #line up colnames
oldpayrolldata[is.na(Title), Title := TitleDescription]
oldpayrolldata[, TitleDescription := NULL] #combine columns and then drop less-complete one
setnames(oldpayrolldata, "Title", "TitleDescription") #line up colnames
oldpayrolldata[, AgencyStartDate := as.character(format(AgencyStartDate, "%m/%d/%Y"))]
oldpayrolldata <- oldpayrolldata[,-18] #drop the duplicated (and nearly empty) LeaveStatus column. 
setnames(oldpayrolldata, "Hours(Regular)", "RegularHours")
oldpayrolldata$RegularHours <- ifelse(is.na(oldpayrolldata$RegularHours)==T, oldpayrolldata$HoursRegular, oldpayrolldata$RegularHours)

#can we just stick these together and fill the gaps with NAs? hate to drop columns if we don't have to
payroll <- rbind(employees, oldpayrolldata, fill=T)
dim(payroll); dim(employees); dim(oldpayrolldata); nrow(employees)+nrow(oldpayrolldata) #good

setkey(payroll,LastName,FirstName, AgencyName, AgencyStartDate)
payroll <- payroll[!(FirstName=="" | LastName=="" | FirstName=="XXXX" | FirstName=="xxx" | LastName== "xxx"),] #drop the ones with names expunged for privacy
payroll[, employeekey := paste(FirstName, LastName, AgencyName, AgencyStartDate, sep=" ")]
setkey(payroll, employeekey, PayBasis, RegularHours)


hourly <- payroll[PayBasis=="per Hour", ]; dim(hourly) #limit to hourly
setkey(hourly, employeekey, FiscalYear, BaseSalary)
hourly[, obscount := nrow(.SD), by=list(employeekey, FiscalYear)] #now look for dupes again
table(hourly$obscount) #still plenty of duplicates; look at them
duplicated <- hourly[obscount >1, ]; dim(duplicated)
hourly20112017long <- nrow(hourly)

hourly <- hourly[!BaseSalary < 7.25, ]; dim(hourly)
hourly20112017longdrop725 <- nrow(hourly) #okay, this drops a lot of those weird foster grandparent obs, etc. 

hourly[, obscount := nrow(.SD), by=list(employeekey, FiscalYear)] #now look for dupes again
table(hourly$obscount) #still plenty of duplicates; look at them
duplicated <- hourly[obscount >1, ]; dim(duplicated) #still some.

setkey(hourly, "employeekey", "FiscalYear", "BaseSalary") #sort so lowest pay is first
hourly[, obsnumber := 1:nrow(.SD), by=list(employeekey, FiscalYear)] #mark dupes
hourly2 <- hourly[obsnumber==1,] #keep first observation only
nrow(hourly2) - nrow(hourly); dim(duplicated)
hourly <- hourly2
hourly[, obscount := nrow(.SD), by=list(employeekey, FiscalYear)] #now look for dupes again
table(hourly$obscount) #none. 


#okay, now reshape. 
employeeswide <- reshape(hourly, idvar = c("employeekey"), timevar = "FiscalYear", v.names = c("TitleDescription", "LeaveStatus", "BaseSalary", "RegularGrossPaid","RegularHours", "WorkLocationBorough", "MidInit"), direction = "wide")
dim(employeeswide) 
hourly20112017wide <- nrow(employeeswide)
hourly20112017wide_dropundermin <- nrow(employeeswide)

#also-- look at people clearly affected by MW: what do their hours look like?
earlymin <- employeeswide[BaseSalary.2014==7.25,]; dim(earlymin)
summary(earlymin$BaseSalary.2015);
summary(earlymin$RegularHours.2014);summary(earlymin$RegularHours.2015); 
earlymin[,Paid2014:=as.numeric(gsub("$", "", RegularGrossPaid.2014, fixed=T))]
earlymin[,Paid2015:=as.numeric(gsub("$", "", RegularGrossPaid.2015, fixed=T))]
summary(earlymin$Paid2014);summary(earlymin$Paid2015); 

middlenames <- subset(employeeswide, select=c("MidInit.2011", "MidInit.2012", "MidInit.2013", "MidInit.2014", "MidInit.2015", "MidInit.2016", "MidInit.2017"))

employeeswide[!is.na(MidInit.2011), MidInit := MidInit.2011 ] 
employeeswide[!is.na(MidInit.2012), MidInit := MidInit.2012 ] 
employeeswide[!is.na(MidInit.2013), MidInit := MidInit.2013 ] 
employeeswide[!is.na(MidInit.2014), MidInit := MidInit.2014 ] 
employeeswide[!is.na(MidInit.2015), MidInit := MidInit.2015 ] 
employeeswide[!is.na(MidInit.2016), MidInit := MidInit.2016 ] 
employeeswide[!is.na(MidInit.2017), MidInit := MidInit.2017 ] 

#also, clean up very implausible middle initials:
employeeswide[MidInit == "`" | MidInit == "." | MidInit == "=" | MidInit == "/" | MidInit == "-" , MidInit := NA]

#use the  most recent non-missing work borough to break ties?
employeeswide[!is.na(WorkLocationBorough.2014), Borough := WorkLocationBorough.2014]
employeeswide[!is.na(WorkLocationBorough.2015), Borough := WorkLocationBorough.2015]
employeeswide[!is.na(WorkLocationBorough.2016), Borough := WorkLocationBorough.2016]
employeeswide[!is.na(WorkLocationBorough.2017), Borough := WorkLocationBorough.2017]

sort(table(employeeswide$Borough))


##save cleaned employees file
write_feather(employeeswide, "Intermediate Files/NycEmployeesformerge.feather")

##############
#Step 3:The Merge
#########
voters[, middleinitial:= toupper(substr(middlename, 0,1))] #line up format of middle initials before merging on them

##set keys
setkey(voters, lastname, firstname, middleinitial, county)
setkey(employeeswide, LastName, FirstName)

#Full join while ignoring counties
#df <- merge(employeeswide, voters, by.x= c("LastName", "FirstName", "MidInit"), 
#            by.y = c("lastname", "firstname", "middleinitial"), all.x=T)

#employees1 <- employeeswide[MidInit=="", ]
#employees2 <- employeeswide[!(MidInit==""), ]
#dim(employees1) + dim(employees2); dim(employeeswide)

#df1a<- merge(employees1, voters, by.x= c("LastName", "FirstName"), 
#            by.y = c("lastname", "firstname"), all.x=T)
#df1b<- merge(employees2, voters, by.x= c("LastName", "FirstName", "MidInit"), 
#            by.y = c("lastname", "firstname", "middleinitial"), all.x=T)
#df1b[, middleinitial:= MidInit]
#df <- rbind(df1a, df1b)

df <- merge(employeeswide, voters, by.x= c("LastName", "FirstName"), 
            by.y = c("lastname", "firstname"))
voterfilematches <- nrow(df)

df[MidInit == "", MidInit := NA] #first, correct the ones that don't actually have middle initials

##use middle initial as a tie breaker when there are many matches
df[, middlenamemismatch:= 0]
df[!(is.na(MidInit)) & !(is.na(middleinitial)) & middleinitial!= MidInit , middlenamemismatch := 1]
df <- df[!(middlenamemismatch==1), ]

matches_dropmiddlemismatch <- nrow(df)

write_feather(df, "Intermediate Files/Voters And Employees Merged.feather")


#############
###Step 4: Flag Matches to Voters that would have started working as children
#######
df$dob <- as.Date(as.character(df$dob), "%Y%m%d")
df$AgencyStartDate <- as.Date(df$AgencyStartDate, "%M/%d/%Y")
##Legal age of first employment is 14
df$fourteenthBD <- df$dob %m+% years(16)

#Maybe could use 18th BD as a tie breaker
df$eighteenthBD <- df$dob %m+% years(18)

##Apparently these matches actually happen
df$StartUnder14 <- df$AgencyStartDate < df$fourteenthBD
df$StartUnder18 <- df$AgencyStartDate < df$eighteenthBD

sum(df$StartUnder14, na.rm=T)
sum(df$StartUnder18 < df$eighteenthBD, na.rm=T)



#Let's drop matches to people who started under 18 if there's another voter id match that started after
#18
dropUnder18 <- (paste0(df$EmployeeFull, "FALSE") %in% unique(paste0(df$EmployeeFull, df$StartUnder18))) & df$StartUnder18               

dead <- (df$reasoncode == "DEATH" & 
           (as.numeric(df$purge_date) < as.numeric("20140630") & (df$LeaveStatus.2014 == "ACTIVE") |
          (as.numeric(df$purge_date) < as.numeric("20150630") & df$LeaveStatus.2015 == "ACTIVE") |
(as.numeric(df$purge_date) < as.numeric("20160630") & df$LeaveStatus.2016 == "ACTIVE")|
(as.numeric(df$purge_date) < as.numeric("20170630") & df$LeaveStatus.2017 == "ACTIVE") | 
(as.numeric(df$purge_date) < as.numeric("20130630") & df$LeaveStatus.2013 == "ACTIVE")|
(as.numeric(df$purge_date) < as.numeric("20120630") & df$LeaveStatus.2012 == "ACTIVE")|
(as.numeric(df$purge_date) < as.numeric("20110630") & df$LeaveStatus.2011 == "ACTIVE")) )

dead.ind <- which(dead)


write_feather(df, "Intermediate Files/All Employees with Matches to Dead and Children.feather")

##################
###Step 6: Drop matches to dead people and Children
##################
df$idVoters[which(df$StartUnder14)] <- NA
df$voterhistory[which(df$StartUnder14)] <- NA
df$idVoters[dead.ind] <- NA
df$voterhistory[dead.ind] <- NA
df <- df[-which(dead | StartUnder18),] 
matches_dropchildrendead <- nrow(df)

write_feather(df, "Intermediate Files/Voters and Employees Without dead and child matches.feather")

##check a few things
sum(!is.na(df$idVoters[df$dead]))
sum(df$dead, na.rm=T)

sum(!is.na(df$idVoters[df$StartUnder14]))
sum(!is.na(df$idVoters[df$StartUnder18]))

sum(df$StartUnder14, na.rm=T)
sum(df$StartUnder18, na.rm=T)

dim(df) #better, still a lot of duplicates
######################################
###Step 7: Break Ties With Counties
###################################
df[Borough=="", Borough:= NA] #drop those blanks so they don't mismatch
df$countyMatch <- df$Borough == df$county

#when there are duplicate obs for an employee, and that employee has a work borough location, 
#then we should privilege matches where county matches borough (and drop other matches). but if any of those aren't true, we'll just keep all the matches
df[, obscount := nrow(.SD), by=list(employeekey)]
df[, employeehasboroughmatch := max(countyMatch), by=list(employeekey)] #mark if there is a row with countymatch
df[, employeehasborough := !is.na(.SD$Borough), by=list(employeekey)]
df[, countytiebreak := 0]
df[(employeehasborough & employeehasboroughmatch & obscount > 1)  & countyMatch==F, countytiebreak := 1] #mark rows to drop

#df <- df[countytiebreak==0, ] #actually, not using this in this version: not sure about assumption people don't travel across boroughs for work
write_feather(df, "Voters and Employees, Dropped Dead and Children, Counties as Tie Breakers.feather")

###################################
###Step 8: Break Ties With Age
###############################
#It seems  unlikely the elderly are working for the city. If there's a duplicate match, we can prefer the one
#that's more in typical working age
df$Age <- as.Date("2014-06-30", "%Y-%m-%d") - df$dob
df$Over70 <- df$Age/365 >70 #& df$LeaveStatusasofJune30 == "ACTIVE"
df[, obscount := nrow(.SD), by=list(employeekey)]
df[, matchesover70 := sum(.SD$Over70), by=list(employeekey)] 
df[, agetiebreak:= 0]
df[(obscount > 1 & (matchesover70 < obscount) & Over70==T), agetiebreak:= 1] #mark the ones to drop 
table(df$agetiebreak) #drops about 125k
df <- df[agetiebreak==0, ]; dim(df)
matches_agetiebreak <- nrow(df)

write_feather(df, "Voters and Employees, Dropped Dead and Children, Counties and Age as Tie Breakers_precollapse.feather")

####now clean up voting, collapse any further voting duplicates, stick it back to our baseline dataset (employeeswide) to ensure we haven't lost anyone, and save.
df[, observations:= .N, by=list(employeekey)]
table(df$observations)
tons <- df[observations==87,]; head(tons) 
df_precollapse <- df

#want to collapse further duplicates by averaging across voting records
df.hourly <- df
df.hourly[, observations:= .N, by=list(employeekey)]
table(df.hourly$observations)

setkey(df.hourly, employeekey)
nrow(df); length(unique(df$idVoters))

#also, quickly figure out who was registered before 2012 election.
df.hourly[, datereg := ymd(regdate)]
df.hourly[, regby2012 := 0]; df.hourly[datereg <= "2012-10-12", regby2012 := 1] #this is the deadline for general election voting 2012. 
#also grab gender/age so we can say some things about our matches (in aggregate)
df.hourly[, male:=NA]; df.hourly[gender=="M", male:=1]; df.hourly[gender=="F", male:=0]; 
df.hourly[, age2016:=NA];df.hourly[, birthday := as.Date(dob, "%Y%m%d")]
electionday2016 <- as.Date("2016-11-08")
df.hourly[, age2016 := as.numeric(electionday2016 - birthday)/365.25] 

#binomial draws
votes <- df.hourly[,list(Voted17=rbinom(1,1, p=mean(voted17)),Voted16=rbinom(1,1, p=mean(voted16)), Voted14=rbinom(1,1, p=mean(voted14)),Voted13=rbinom(1,1, p=mean(voted13)), Voted12=rbinom(1,1, p=mean(voted12)), Voted10=rbinom(1,1, p=mean(voted10)), Voted08=rbinom(1,1, p=mean(voted08)), Voted06=rbinom(1,1, p=mean(voted06)), Voted04=rbinom(1,1, p=mean(voted04)), Voted02=rbinom(1,1, p=mean(voted02)), proportionregby2012 = mean(regby2012), proportionmale = mean(male, na.rm=T), meanagein2016 = mean(age2016, na.rm=T), numrecords = nrow(.SD)), by=list(employeekey)]
votes[, mergedtovoterfile:=1]
collapsedmatches <- nrow(votes)

hourlyfinal <- merge(employeeswide, votes, by=c("employeekey"), all.x=T)
dim(hourlyfinal); dim(employeeswide); dim(votes)

#recode missing is not voted
hourlyfinal$Voted17[is.na(hourlyfinal$Voted17)] <- 0
hourlyfinal$Voted16[is.na(hourlyfinal$Voted16)] <- 0
hourlyfinal$Voted14[is.na(hourlyfinal$Voted14)] <- 0
hourlyfinal$Voted13[is.na(hourlyfinal$Voted13)] <- 0
hourlyfinal$Voted12[is.na(hourlyfinal$Voted12)] <- 0
hourlyfinal$Voted10[is.na(hourlyfinal$Voted10)] <- 0
hourlyfinal$Voted08[is.na(hourlyfinal$Voted08)] <- 0
hourlyfinal$Voted06[is.na(hourlyfinal$Voted06)] <- 0
hourlyfinal$Voted04[is.na(hourlyfinal$Voted04)] <- 0
hourlyfinal$Voted02[is.na(hourlyfinal$Voted02)] <- 0
hourlyfinal$Voted00[is.na(hourlyfinal$Voted00)] <- 0
summary(hourlyfinal$Voted16)
percentagematchedtoVF <- round(100*(sum(hourlyfinal$mergedtovoterfile, na.rm=T) / nrow(hourlyfinal)))


##This is the file used for our main analysis
write_feather(hourlyfinal, "Voters and Employees, Dropped Dead and Children, Counties and Age as Tie Breakers_newmerge2018file.feather") 

c(hourly20112017long,hourly20112017longdrop725, hourly20112017wide, hourly20112017wide_dropundermin, voterfilerows, voterfilematches, matches_dropmiddlemismatch,matches_dropchildrendead,  matches_boroughtiebreak, matches_agetiebreak, collapsedmatches, percentagematchedtoVF) 
#print them all out for ease of adding to paper. this is the order in which they appear in the header of the main paper TeX file. 
